Part 1: Descriptive analysis
Unique French names
Evolution over time
# Convert the national data's year column to a Date set to 31 December of
# that year, so it can be used on a date axis and with year().
# The previous pad / replace / parse sequence only worked on the raw
# four-character years: re-running it on the already-converted column would
# no longer parse. The guard makes the conversion safe to re-run.
if (!inherits(data_nat_clean$year, "Date")) {
  data_nat_clean$year <- as.Date(
    paste0(data_nat_clean$year, "-12-31"),
    format = "%Y-%m-%d"
  )
}
# Count, for each year, how many different first names appear in the data
distinct_names <- summarise(
  group_by(data_nat_clean, year),
  n_names = n_distinct(firstname)
)
# Line chart of the yearly count of distinct first names, with a centred title
plot_distinct_names <- ggplot(distinct_names, aes(x = year, y = n_names)) +
  geom_line(size = 1.2, color = "blue") +
  labs(
    title = "Number of French unique names from 1900 to 2018",
    x = "Year",
    y = "Number of unique names"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
plot_distinct_names

Difference from one year to the next over time
Most popular names for each region
# Build the region dataset: the departmental data plus a `region` column
# holding the region label assigned to each department code.
region <- local({
  # Department codes, listed under the region label they map to
  departments_by_region <- list(
    "Ile-de-France" = c("95", "78", "91", "77", "94", "92", "93", "75"),
    "Champagne-Ardenne" = c("08", "51", "10", "52"),
    "Picardie" = c("02", "60", "80"),
    "Haute-normandie" = c("76", "27"),
    "Centre" = c("18", "28", "36", "37", "41", "45"),
    "Basse-Normandie" = c("14", "50", "61"),
    "Bourgogne" = c("21", "58", "71", "89"),
    "Nord-Pas-de-Calais" = c("59", "62"),
    "Lorraine" = c("54", "55", "57", "88"),
    "Alsace" = c("67", "68"),
    "Franche-Comté" = c("25", "39", "70", "90"),
    "Pays de la Loire" = c("44", "49", "53", "72", "85"),
    "Bretagne" = c("22", "29", "35", "56"),
    "Poitou-CharenteS" = c("16", "17", "79", "86"),
    "Aquitaine" = c("24", "33", "40", "47", "64"),
    "Midi-Pyrenées" = c("09", "12", "31", "32", "46", "65", "81", "82"),
    "Limousin" = c("19", "23", "87"),
    "Rhone-Alpes" = c("01", "07", "26", "38", "42", "69", "73", "74"),
    "Auvergne" = c("03", "15", "43", "63"),
    "Languedoc-Roussillon" = c("11", "30", "34", "48", "66"),
    "PACA" = c("04", "05", "06", "13", "83", "84"),
    "Corse" = c("20"),
    "Overseas territories" = c("971", "972", "973", "974")
  )
  # Invert the list into a lookup vector: names are department codes,
  # values are region labels
  region_of_department <- setNames(
    rep(names(departments_by_region), lengths(departments_by_region)),
    unlist(departments_by_region, use.names = FALSE)
  )
  # Codes that are not in the lookup get NA
  data_dpt_clean %>%
    mutate(region = unname(region_of_department[as.character(department)]))
})
# For every year, sex and region: total the births per first name, then keep
# the name(s) whose total is the highest in that year/sex/region (ties kept).
# The result stays grouped by year, sex and region.
table <- region %>%
  group_by(year, sex, region, firstname) %>%
  mutate(number = sum(number)) %>%
  ungroup() %>%
  select(sex, firstname, year, number, region) %>%
  distinct() %>%
  group_by(year, sex, region) %>%
  filter(number == max(number))
# Split by sex code: rows with sex 1 go to `boys`, rows with sex 2 to `girls`
boys <- filter(table, sex == 1)
girls <- filter(table, sex == 2)
Girls' names
# Animated bar chart for girls: one bar per region showing the births of the
# top first name, one animation frame per year, bars labelled with the name.
girls %>%
  plot_ly(
    x = ~region,
    y = ~number,
    frame = ~year,
    text = ~firstname,
    textposition = "auto",
    # hoverinfo must be a "+"-joined combination of x, y, z, text and name
    # (or "all" / "none" / "skip"); the previous value "number" is not a
    # valid flag. Show the region, the number of births and the name.
    hoverinfo = "x+y+text",
    type = "bar",
    marker = list(color = "rgb(255,192,203)")
  ) %>%
  layout(
    xaxis = list(title = "French regions"),
    yaxis = list(title = "Number of births"),
    title = list(
      text = "Evolution of the most popular girl French firstname according to the region from 1900 to 2018",
      font = list(size = 14)
    ),
    showlegend = FALSE
  ) %>%
  animation_slider(
    currentvalue = list(font = list(color = "black")),
    # Top padding of the slider
    pad = list(t = 130)
  )
Boys' names
# Animated bar chart for boys: one bar per region showing the births of the
# top first name, one animation frame per year, bars labelled with the name.
boys %>%
  plot_ly(
    x = ~region,
    y = ~number,
    frame = ~year,
    text = ~firstname,
    textposition = "auto",
    # hoverinfo must be a "+"-joined combination of x, y, z, text and name
    # (or "all" / "none" / "skip"); the previous value "number" is not a
    # valid flag. Show the region, the number of births and the name.
    hoverinfo = "x+y+text",
    type = "bar",
    marker = list(color = "rgb(135,206,235)")
  ) %>%
  layout(
    xaxis = list(title = "French regions"),
    yaxis = list(title = "Number of births"),
    title = list(
      text = "Evolution of the most popular boy French firstname according to the region from 1900 to 2018",
      font = list(size = 14)
    ),
    showlegend = FALSE
  ) %>%
  animation_slider(
    currentvalue = list(font = list(color = "black")),
    # Top padding of the slider
    pad = list(t = 130)
  )
Part 2
# Yearly births for three first names, with a marker for the 1998 football
# world cup.
zinedine <- data_nat_clean # %>% filter(year(year) > 1993 & year(year)<2003)
#zinedine$year <- as.factor(zinedine$year)
zinedine1 <- zinedine %>%
  filter(firstname %in% c("ZINEDINE", "BIXENTE", "YOURI"))
graph <- ggplot(zinedine1, aes(x = year, y = number, colour = firstname)) +
  geom_line(size = 1.5) +
  ggtitle("Plot of number of name by years for football player in 1998") +
  xlab("Year") + ylab("Number of name") +
  # xintercept is a fixed parameter rather than an aes() mapping, so a single
  # line is drawn instead of one copy per row of the plot data.
  geom_vline(
    xintercept = as.numeric(as.Date("1998-01-01")),
    linetype = "longdash"
  ) +
  # annotate() draws the label once; geom_label() with constant x/y/label
  # repeated it for every row of the plot data.
  annotate(
    "label",
    x = as.Date("1998-01-01"), y = 180,
    label = "1998 Football world cup", colour = "black"
  )
graph

#CM18 <- data_nat_clean %>% filter(year(year) > 2010 )
#CM18 <- CM18 %>% filter(firstname == "ANTOINE" | firstname=="KYLIAN" | firstname=="BLAISE" | firstname=="BENJAMIN" | firstname=="SAMUEL" | firstname=="HUGO")
#graph <- ggplot(CM18, aes(x = year, y = number, colour = firstname))+
# geom_line(size=1.5) + ggtitle("Plot of number of name by years for football player IN 2018") +
#xlab("Year") + ylab("Number of name")
#graph
# Yearly births after 2004 for three first names, with a marker at the start
# of 2011 labelled as the first season of the series.
got <- data_nat_clean %>%
  filter(
    firstname %in% c("BRAN", "SANSA", "DAENERYS"),
    year(year) > 2004
  )
graph <- ggplot(got, aes(x = year, y = number, colour = firstname)) +
  geom_line(size = 1.5) +
  ggtitle("Plot of number of name by years in link with Game of throne") +
  xlab("Year") + ylab("Number of name") +
  # xintercept is a fixed parameter rather than an aes() mapping, so a single
  # line is drawn instead of one copy per row of the plot data.
  geom_vline(
    xintercept = as.numeric(as.Date("2011-01-01")),
    linetype = "longdash"
  ) +
  # annotate() draws the label once; geom_label() with constant x/y/label
  # repeated it for every row of the plot data.
  annotate(
    "label",
    x = as.Date("2011-01-01"), y = 20,
    label = "Game of throne, season 1", colour = "black"
  )
graph

# Yearly births after 1990 for first names linked to films, with dashed
# markers and labels at the dates attached to each film.
# ARWEN is restricted to sex code "2"; the other names are taken for both sexes.
arwen <- data_nat_clean %>% filter(firstname == "ARWEN", sex == "2")
cinema <- data_nat_clean %>% filter(firstname %in% c("NEO", "BELLA", "ANAKIN"))
cinema <- bind_rows(arwen, cinema) %>% filter(year(year) > 1990)
graph <- ggplot(cinema, aes(x = year, y = number, colour = firstname)) +
  geom_line(size = 1.5) +
  ggtitle("Plot of number of name by years in link with the cinema") +
  xlab("Year") + ylab("Number of name") +
  # Fixed xintercept values (not an aes() mapping): each line is drawn once
  # instead of once per row of the plot data.
  geom_vline(
    xintercept = as.numeric(as.Date(c("1999-01-01", "2009-01-01", "2001-01-01"))),
    linetype = "longdash"
  ) +
  # annotate() draws each label once; geom_label() with constant x/y/label
  # repeated it for every row of the plot data. Two labels share 1999.
  annotate(
    "label",
    x = as.Date(c("1999-01-01", "2009-01-01", "1999-01-01", "2001-01-01")),
    y = c(25, 105, 100, 175),
    label = c(
      "Matrix",
      "Twilight",
      "Star Wars: Episode I ",
      "The Lord of the rings "
    ),
    colour = "black", size = 3
  )
graph

# Yearly births between 1910 and 1950 for first names linked to the world
# wars, with dashed markers and labels for four dates.
# ADOLPHE is restricted to sex code "1" and VICTOIRE to sex code "2"; the
# remaining names are taken for both sexes.
adolphe <- data_nat_clean %>%
  filter(firstname == "ADOLPHE", sex == "1",
         year(year) > 1909, year(year) < 1951)
victoire <- data_nat_clean %>%
  filter(firstname == "VICTOIRE", sex == "2",
         year(year) > 1909, year(year) < 1951)
ww <- data_nat_clean %>%
  filter(firstname %in% c("JOFFRE", "JOFFRETTE", "ADOLPHINE"),
         year(year) > 1909, year(year) < 1951)
ww <- bind_rows(ww, adolphe, victoire)
graph <- ggplot(ww, aes(x = year, y = number, colour = firstname)) +
  geom_line(size = 1) +
  ggtitle("Plot of number of name by years in link with the world wars") +
  xlab("Year") + ylab("Number of name") +
  # Fixed xintercept values (not an aes() mapping): each line is drawn once
  # instead of once per row of the plot data.
  geom_vline(
    xintercept = as.numeric(as.Date(
      c("1914-01-01", "1921-01-01", "1918-01-01", "1945-01-01")
    )),
    linetype = "longdash"
  ) +
  # annotate() draws each label once; geom_label() with constant x/y/label
  # repeated it for every row of the plot data.
  # The last label previously read "End of the World II" (missing "War").
  annotate(
    "label",
    x = as.Date(c("1914-01-01", "1921-01-01", "1918-01-01", "1945-01-01")),
    y = c(0, 550, 500, 200),
    label = c(
      "Marne's Battle",
      "Hitler leader of the NSDAP",
      "End of the World War I",
      "End of the World War II"
    ),
    colour = "black", size = 2.5
  )
graph

# Yearly births for FELICIE, ROXANNE, OSCAR and LOLITA over the whole period.
# The title used to read "... in link with Game of throne", a leftover from
# the plot this code was copied from; it now names the plotted first names.
# NOTE(review): the variable is still called `got` (it overwrites the earlier
# Game of Thrones subset) — consider renaming once the intended theme of this
# plot is confirmed.
got <- data_nat_clean %>%
  filter(firstname %in% c("FELICIE", "ROXANNE", "OSCAR", "LOLITA"))
graph <- ggplot(got, aes(x = year, y = number, colour = firstname)) +
  geom_line(size = 1.5) +
  ggtitle("Plot of number of name by years for Felicie, Roxanne, Oscar and Lolita") +
  xlab("Year") + ylab("Number of name")
graph

# Convert the departmental data's year column to a Date set to 31 December of
# that year, so it can be used with year().
# The previous pad / replace / parse sequence only worked on the raw
# four-character years: re-running it on the already-converted column would
# no longer parse. The guard makes the conversion safe to re-run.
if (!inherits(data_dpt_clean$year, "Date")) {
  data_dpt_clean$year <- as.Date(
    paste0(data_dpt_clean$year, "-12-31"),
    format = "%Y-%m-%d"
  )
}
# Departmental data for the years 1990 to 2012, with each department code
# replaced by its region label and the column renamed to `Region`.
dep <- local({
  # Department codes, listed under the region label they map to.
  # NOTE(review): "PACA " carries a trailing space; it is kept as-is because
  # Region is later used as a merge key against `eco` — confirm it matches.
  departments_by_region <- list(
    "Ile-de-France" = c("95", "78", "91", "77", "94", "92", "93", "75"),
    "Champagne-Ardenne" = c("08", "51", "10", "52"),
    "Picardie" = c("02", "60", "80"),
    "Haute-normandie" = c("76", "27"),
    "Centre" = c("18", "28", "36", "37", "41", "45"),
    "Basse-Normandie" = c("14", "50", "61"),
    "Bourgogne" = c("21", "58", "71", "89"),
    "Nord-Pas-de-Calais" = c("59", "62"),
    "Lorraine" = c("54", "55", "57", "88"),
    "Alsace" = c("67", "68"),
    "Franche-Comté" = c("25", "39", "70", "90"),
    "Pays de la Loire" = c("44", "49", "53", "72", "85"),
    "Bretagne" = c("22", "29", "35", "56"),
    "Poitou-CharenteS" = c("16", "17", "79", "86"),
    "Aquitaine" = c("24", "33", "40", "47", "64"),
    "Midi-Pyrenées" = c("09", "12", "31", "32", "46", "65", "81", "82"),
    "Limousin" = c("19", "23", "87"),
    "Rhone-Alpes" = c("01", "07", "26", "38", "42", "69", "73", "74"),
    "Auvergne" = c("03", "15", "43", "63"),
    "Languedoc-Roussillon" = c("11", "30", "34", "48", "66"),
    "PACA " = c("04", "05", "06", "13", "83", "84"),
    "Corse" = c("20"),
    "Overseas territories" = c("971", "972", "973", "974")
  )
  # Invert the list into a lookup vector: names are department codes,
  # values are region labels
  region_of_department <- setNames(
    rep(names(departments_by_region), lengths(departments_by_region)),
    unlist(departments_by_region, use.names = FALSE)
  )
  # Codes that are not in the lookup get NA
  data_dpt_clean %>%
    filter(year(year) > 1989, year(year) < 2013) %>%
    mutate(
      department = unname(region_of_department[as.character(department)])
    ) %>%
    rename(Region = department)
})
# 1990: for each sex, keep per region the row(s) of `dep` with the highest
# `number`, then join the regional `eco` table on Region.
# NOTE(review): max(number) is taken over the rows of `dep` as they are; if
# those rows are still per department, this is not a regional total — confirm.
eco1990boy <- dep %>%
  filter(sex == 1, year(year) == 1990) %>%
  group_by(Region) %>%
  filter(number == max(number))
eco1990boy <- merge(eco1990boy, eco, by = "Region")
eco1990girl <- dep %>%
  filter(sex == 2, year(year) == 1990) %>%
  group_by(Region) %>%
  filter(number == max(number))
eco1990girl <- merge(eco1990girl, eco, by = "Region")
eco1990 <- bind_rows(eco1990boy, eco1990girl)

# Axis labels now match the mapped variables: first names are on x (it was
# labelled "GDP") and the X1990 column is on y (it was labelled
# "Number of name").
# NOTE(review): X1990 is presumed to be the 1990 GDP column of `eco`, as the
# plot title suggests — confirm.
graph <- ggplot(eco1990boy, aes(x = firstname, y = X1990, color = Region)) +
  geom_point(size = 1.5) +
  ggtitle("Plot of more popular boys firstnames in 1990 by Region and GDP") +
  xlab("Firstname") + ylab("GDP") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
graph

# Same plot for girls; the title previously said "boys" (copy-paste slip)
graph <- ggplot(eco1990girl, aes(x = firstname, y = X1990, color = Region)) +
  geom_point(size = 1.5) +
  ggtitle("Plot of more popular girls firstnames in 1990 by Region and GDP") +
  xlab("Firstname") + ylab("GDP") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
graph

# 2000: for each sex, keep per region the row(s) of `dep` with the highest
# `number`, then join the regional `eco` table on Region.
# NOTE(review): max(number) is taken over the rows of `dep` as they are; if
# those rows are still per department, this is not a regional total — confirm.
eco2000boy <- dep %>%
  filter(sex == 1, year(year) == 2000) %>%
  group_by(Region) %>%
  filter(number == max(number))
eco2000boy <- merge(eco2000boy, eco, by = "Region")
eco2000girl <- dep %>%
  filter(sex == 2, year(year) == 2000) %>%
  group_by(Region) %>%
  filter(number == max(number))
eco2000girl <- merge(eco2000girl, eco, by = "Region")
eco2000 <- bind_rows(eco2000boy, eco2000girl)

# Axis labels now match the mapped variables: first names are on x (it was
# labelled "GDP") and the X2000 column is on y (it was labelled
# "Number of name").
# NOTE(review): X2000 is presumed to be the 2000 GDP column of `eco`, as the
# plot title suggests — confirm.
graph <- ggplot(eco2000boy, aes(x = firstname, y = X2000, color = Region)) +
  geom_point(size = 1.5) +
  ggtitle("Plot of more popular boys firstnames in 2000 by Region and GDP") +
  xlab("Firstname") + ylab("GDP") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
graph

# Same plot for girls; the title previously said "boys" (copy-paste slip)
graph <- ggplot(eco2000girl, aes(x = firstname, y = X2000, color = Region)) +
  geom_point(size = 1.5) +
  ggtitle("Plot of more popular girls firstnames in 2000 by Region and GDP") +
  xlab("Firstname") + ylab("GDP") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
graph

# 2010: for each sex, keep per region the row(s) of `dep` with the highest
# `number`, then join the regional `eco` table on Region.
# NOTE(review): max(number) is taken over the rows of `dep` as they are; if
# those rows are still per department, this is not a regional total — confirm.
eco2010boy <- dep %>%
  filter(sex == 1, year(year) == 2010) %>%
  group_by(Region) %>%
  filter(number == max(number))
eco2010boy <- merge(eco2010boy, eco, by = "Region")
eco2010girl <- dep %>%
  filter(sex == 2, year(year) == 2010) %>%
  group_by(Region) %>%
  filter(number == max(number))
eco2010girl <- merge(eco2010girl, eco, by = "Region")
eco2010 <- bind_rows(eco2010boy, eco2010girl)

# Axis labels now match the mapped variables: first names are on x (it was
# labelled "GDP") and the X2010 column is on y (it was labelled
# "Number of name").
# NOTE(review): X2010 is presumed to be the 2010 GDP column of `eco`, as the
# plot title suggests — confirm.
graph <- ggplot(eco2010boy, aes(x = firstname, y = X2010, color = Region)) +
  geom_point(size = 1.5) +
  ggtitle("Plot of more popular boys firstnames in 2010 by Region and GDP") +
  xlab("Firstname") + ylab("GDP") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
graph

# Same plot for girls; the title previously said "boys" (copy-paste slip)
graph <- ggplot(eco2010girl, aes(x = firstname, y = X2010, color = Region)) +
  geom_point(size = 1.5) +
  ggtitle("Plot of more popular girls firstnames in 2010 by Region and GDP") +
  xlab("Firstname") + ylab("GDP") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))
graph
